# ==============================================================================
# file name: 00_create-regex-bevec-vehovar.R
# date:	Nov 24, 2022
# author: Bernhard Clemm / Tiago Ventura 
# purpose: Create regex patterns from Bevec & Vehovar (2021) questionnaire sites
# ==============================================================================

# INPUT ========================================================================

# This is the comprehensive list by Bevec & Vehovar (2021) (see their Table 12 on p.7)
# ... from which we filtered out irrelevant types (e.g., “UX tool”) 
# ... and manually verified the associated web addresses, added
# ... alternative/additional hosts for a tool, and made sure that there are
# ... no other reasons (e.g., email) for a subject to visit these hosts.

# Load the manually verified list of tool hosts (one row per tool)
url_matches <- read.csv(
  file = "data/raw_to_processed/url_hosts/bevec_url_matches.csv"
)

# CREATE REGEX =================================================================

# We match the list with regular expressions against the data because
# ... subdomains often vary within a tool, e.g., there will be "nyu.qualtrics.com" and "uva.qualtrics.com"
# ... but matching to domain only, i.e., "qualtrics.com", does not work because for some tools, 
# ... only the subdomain is what we want (e.g., survey.zoho.com but not mail.zoho.com)

# Reshape to long format: keep the company name plus every "match_url*"
# column, then stack those columns so each row holds one (company, pattern)
# pair.
# NOTE(review): needs dplyr and tidyr attached; no library() call is visible
# in this file -- confirm they are loaded by the calling script.
url_matches <- select(url_matches, company, contains("match_url"))
url_matches <- pivot_longer(
  url_matches,
  cols = contains("match_url"),
  names_to = "number",
  values_to = "match"
)
# Drop cells that did not contain a pattern
url_matches <- filter(url_matches, match != "")

# regex approach: a URL host counts as a match if it
# (1) ends with the pattern, e.g. pattern "paperform.co" should not match a host "paperform.com"
# (2) either (a) begins with the pattern, or 
# (b) has a "." before the pattern, e.g. "nyu.qualtrics.com" should match but not "fakequaltrics.com"

# Create string of match patterns.
# The host names are literal strings, but they contain regex metacharacters
# (most importantly "."). Escape them first; otherwise the "." in
# "paperform.co" would match any character and e.g. "paperformXco" would be
# accepted as a match.
escaped_matches <- gsub(
  "([.\\\\|()\\[\\]{}^$*+?])", "\\\\\\1", url_matches$match,
  perl = TRUE
)

# The result is one alternation with two branches per host pattern
match_patterns <- paste0(
  # Host is exactly the pattern (2a)
  "^", paste(escaped_matches, collapse = "$|^"), "$|",
  # Host ends with "." followed by the pattern (2b)
  "\\.", paste(escaped_matches, collapse = "$|\\."), "$")

# Export the combined pattern string as a single line of text.
# write.table() is left at its default quoting, so the output file format is
# unchanged (the pattern is written wrapped in double quotes).
# TRUE/FALSE are spelled out because T/F are ordinary variables that can be
# reassigned.
write.table(match_patterns,
            "data/raw_to_processed/url_hosts/bevec_url_matches_patterns.txt",
            row.names = FALSE, col.names = FALSE)
